Vendor recommender - Collaborative Filtering outcomes

@olibolly

OUTCOMES

TO DO

  • TRAINING (1.9M rows): the kernel crashed above 20K rows -> need to Map/Reduce, get a higher-performance machine, or use another algorithm (matrix factorization)?
  • Think about scaling or binarizing the count data -> to improve results
  • Look at match between product service code (#5833) & vendors (#770526)
  • add FILTER: Geo
  • add FILTER: Already done business with a company?
  • Save outcome in output tables for front end Big query API access

In [ ]:
%%bq query -n df_query
-- Count rows of usa_spending_all per (business-size determination, agency,
-- vendor) and keep the 20000 largest groups.
select
contractingofficerbusinesssizedetermination,
mod_agency,
vendorname,
count(*) as count
from `fiery-set-171213.vrec.usa_spending_all`
-- Only vendors whose country code is one of the two US spellings.
where vendorcountrycode in ('UNITED STATES', 'USA: UNITED STATES OF AMERICA')
-- Only rows with an explicit small / other-than-small determination.
and contractingofficerbusinesssizedetermination in ('O: OTHER THAN SMALL BUSINESS', 'S: SMALL BUSINESS')
-- Drop rows whose agency is the empty string.
and mod_agency not in ("")
group by 1,2,3
order by count DESC
-- NOTE(review): presumably capped at 20000 because of the kernel crash
-- mentioned in the TO DO list -- confirm before raising.
limit 20000

In [ ]:
# Run the named query and pull the full result into a pandas DataFrame.
df = df_query.execute(
    output_options=bq.QueryOutput.dataframe()
).result()
# Preview the first rows in the notebook output.
df.head()

In [ ]:
# The size determination is not needed to build the agency x vendor matrix.
df1 = df.drop('contractingofficerbusinesssizedetermination', axis=1)

# Matrix dimensions: one row per distinct agency, one column per distinct vendor.
n_agency = len(df1['mod_agency'].unique())
n_vendors = len(df1['vendorname'].unique())
print('Number of gov agency = %s | Number of vendors = %s' % (n_agency, n_vendors))

In [ ]:
# Convert the categorical agency / vendor names to integer labels.
# A separate encoder is kept per column so labels can be mapped back later.
le_agency = LabelEncoder()
le_vendor = LabelEncoder()
label_agency = le_agency.fit_transform(df1['mod_agency'])
label_vendor = le_vendor.fit_transform(df1['vendorname'])

df_agency = pd.DataFrame(label_agency)
df_vendor = pd.DataFrame(label_vendor)

# Side-by-side frame of (agency label, vendor label, count), aligned on index.
df2 = pd.concat([df_agency, df_vendor, df1['count']], axis=1)
df2.columns = ['mod_agency', 'vendorname', 'count']
df2.head(5)

# To get the original names back from labels:
# le_agency.inverse_transform([173, 100])

In [ ]:
# Split into training and test data sets (75% / 25%).
# A fixed random_state makes the split -- and therefore the RMSE figures and
# the recommendations derived from it -- reproducible between notebook runs.
train_data, test_data = cv.train_test_split(df2, test_size=0.25, random_state=42)

In [ ]:
# Build the agency x vendor count matrices.
#
# LabelEncoder labels are already 0-based, so they are used directly as
# row / column indices.  Subtracting 1 would wrap label 0 round to the last
# row / column and leave the matrices out of step with the labels that are
# used to look predictions up later on.
#
# Counts are accumulated (+=) rather than assigned: the query groups by the
# size determination as well, and that column has been dropped, so the same
# (agency, vendor) pair can occur on more than one row.
train_data_matrix = np.zeros((n_agency, n_vendors))
for row in train_data.itertuples():
    # row = (index, mod_agency label, vendorname label, count)
    train_data_matrix[row[1], row[2]] += row[3]

test_data_matrix = np.zeros((n_agency, n_vendors))
for row in test_data.itertuples():
    test_data_matrix[row[1], row[2]] += row[3]


def _cosine_similarity(matrix):
    """Return the pairwise cosine similarity between the rows of `matrix`."""
    # pairwise_distances returns cosine *distance* (1 - similarity); used as-is
    # it would give the largest weights to the least similar rows.
    similarity = pairwise_distances(matrix, metric='cosine')
    # Convert in place to avoid holding a second full-size square matrix.
    np.subtract(1.0, similarity, out=similarity)
    # Every row is fully similar to itself; this also keeps the per-row
    # normaliser used by the prediction step strictly positive, even for
    # all-zero rows.
    np.fill_diagonal(similarity, 1.0)
    return similarity


# Compute cosine similarity between agencies (rows) and between vendors (columns)
user_similarity = _cosine_similarity(train_data_matrix)
item_similarity = _cosine_similarity(train_data_matrix.T)

def predict(ratings, similarity, type='user'):
    """Predict a score for every (row, column) cell of `ratings`.

    Args:
        ratings: 2-D array of observed scores, one row per user and one
            column per item.
        similarity: square 2-D array of pairwise weights; user x user when
            `type` is 'user', item x item when `type` is 'item'.
        type: 'user' for user-based or 'item' for item-based prediction.

    Returns:
        2-D array with the same shape as `ratings` holding the predictions.

    Raises:
        ValueError: if `type` is neither 'user' nor 'item'.
    """
    if type not in ('user', 'item'):
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))

    # Normaliser: total absolute weight per row of the similarity matrix.
    weight_totals = np.abs(similarity).sum(axis=1)
    # A row with no weight at all contributes nothing; dividing by 1 instead
    # of 0 keeps its prediction finite rather than NaN/inf.
    safe_totals = np.where(weight_totals == 0, 1.0, weight_totals)

    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the per-user mean into a column so it broadcasts
        # across every item of that user.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weighted_diff = similarity.dot(ratings_diff) / safe_totals[:, np.newaxis]
        return mean_user_rating[:, np.newaxis] + weighted_diff

    # Item-based: weight each user's scores by item-item similarity and
    # normalise per target item (row vector broadcast over users).
    return ratings.dot(similarity) / safe_totals[np.newaxis, :]
  
# Score every (agency, vendor) cell of the training matrix with both
# flavours of collaborative filtering.
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')

In [ ]:
# Evaluation
def rmse(prediction, ground_truth):
    """Return the root-mean-square error over the nonzero cells of `ground_truth`.

    Only cells that are nonzero in `ground_truth` are compared, so the score
    reflects the entries actually present in the held-out data.
    """
    observed = ground_truth.nonzero()
    predicted_scores = prediction[observed].flatten()
    actual_scores = ground_truth[observed].flatten()
    return sqrt(mean_squared_error(predicted_scores, actual_scores))

# Report the error of both predictors against the held-out test matrix.
print('User-based CF RMSE: %s' % rmse(user_prediction, test_data_matrix))
print('Item-based CF RMSE: %s' % rmse(item_prediction, test_data_matrix))

In [ ]:
print('Worklow 1')
print('=' * 100)
print('Select your agency:')
agency = df1['mod_agency'][10]
print(agency)
print('=' * 100)
print('1. Have you considered working with these SMB companies (user prediction?')

# LabelEncoder.transform works on array-likes, so wrap the single agency name
# in a list and unpack the one resulting label.  The scalar label is stored
# back into `agency` because the item-item cell indexes with that name too.
agency = le_agency.transform([agency])[0]

# One predicted score per vendor for the selected agency (single column).
vendor_reco = pd.DataFrame(user_prediction[agency, :])
# Vendor name for each score, in label order.  (This previously read
# len(agency_reco), a name that is never defined.)
labels = pd.DataFrame(le_vendor.inverse_transform(range(len(vendor_reco))))

df_reco = pd.concat([vendor_reco, labels], axis=1)
df_reco.columns = ['reco_score', 'vendorname']

# Join to get the SMB list (vendor -> business size determination).
# NOTE(review): df has one row per (size, agency, vendor) group, so df_smb can
# repeat a vendor and the join can repeat it in the output -- consider
# de-duplicating if the top-10 shows the same vendor several times.
df_smb = df.drop(['mod_agency', 'count'], axis=1)
df_reco = df_reco.set_index('vendorname').join(df_smb.set_index('vendorname'))

# Highest scores first, then keep the ten best small businesses.
df_reco = df_reco.sort_values(['reco_score'], ascending=False)
df_reco[df_reco['contractingofficerbusinesssizedetermination'] == 'S: SMALL BUSINESS'].head(10)

In [ ]:
print('=' * 100)
print('2. Have you considered working with these SMB companies (item-item prediction?')

# Item-based scores for the agency selected in the previous cell.
vendor_reco = pd.DataFrame(item_prediction[agency, :])

# Attach vendor names, then the size determination, and rank by score.
df_reco = pd.concat([vendor_reco, labels], axis=1)
df_reco.columns = ['reco_score', 'vendorname']
df_reco = (
    df_reco.set_index('vendorname')
    .join(df_smb.set_index('vendorname'))
    .sort_values(['reco_score'], ascending=[0])
)

# Ten best-scoring small businesses.
df_reco[df_reco['contractingofficerbusinesssizedetermination'] == 'S: SMALL BUSINESS'].head(10)

In [ ]:
print('Worklow 2')
print('=' * 100)
print('Select a vendor:')

# Workflow 2 - work in progress. Planned steps:
#   1. Select a vendor
#   2. List other, similar vendors